#IMPORTING LIBRARIES
import pandas as pd
import numpy as np
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
#IMPORTING "PRODUCT" DATA FOR H&M
# Load the product catalogue from CSV and preview the first ten rows.
# assumes products.csv sits in the working directory — TODO confirm path
prod = pd.read_csv("products.csv")
prod.head(10)
| product_id | product_code | product_type_no | product_type_name | product_group_name | department_no | department_name | index_code | index_group_name | graphical_appearance_no | graphical_appearance | colour_group_code | colour_group_name | section_no | clothing_group_no | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 108775015 | 108775 | 253 | Vest top | Garment Upper body | 1676 | Jersey Basic | A | Ladieswear | 1010016 | Solid | 9 | Black | 16 | 1002 |
| 1 | 108775044 | 108775 | 253 | Vest top | Garment Upper body | 1676 | Jersey Basic | A | Ladieswear | 1010016 | Solid | 10 | White | 16 | 1002 |
| 2 | 108775051 | 108775 | 253 | Vest top | Garment Upper body | 1676 | Jersey Basic | A | Ladieswear | 1010017 | Stripe | 11 | Off White | 16 | 1002 |
| 3 | 110065001 | 110065 | 306 | Tank Tops | Base Layers | 1339 | Lifestyle | B | Ladieswear | 1010016 | Solid | 9 | Black | 61 | 1017 |
| 4 | 110065002 | 110065 | 306 | Tank Tops | Base Layers | 1339 | Lifestyle | B | Ladieswear | 1010016 | Solid | 10 | White | 61 | 1017 |
| 5 | 110065011 | 110065 | 306 | Tank Tops | Base Layers | 1339 | Lifestyle | B | Ladieswear | 1010016 | Solid | 12 | Light Beige | 61 | 1017 |
| 6 | 111565001 | 111565 | 304 | Leggings | Socks & Tights | 3608 | Tights basic | B | Ladieswear | 1010016 | Solid | 9 | Black | 62 | 1021 |
| 7 | 111565003 | 111565 | 302 | Socks | Socks & Tights | 3608 | Tights basic | B | Ladieswear | 1010016 | Solid | 13 | Beige | 62 | 1021 |
| 8 | 111586001 | 111586 | 273 | Leggings/Tights | Garment Lower body | 3608 | Tights basic | B | Ladieswear | 1010016 | Solid | 9 | Black | 62 | 1021 |
| 9 | 111593001 | 111593 | 304 | Leggings | Socks & Tights | 3608 | Tights basic | B | Ladieswear | 1010016 | Solid | 9 | Black | 62 | 1021 |
#IMPORTING "CUSTOMER" DATA FOR H&M
# Load the customer table from CSV and preview the first ten rows.
cust = pd.read_csv("customers.csv")
cust.head(10)
| customer_id | FN | Active | club_member_status | fashion_news_frequency | age | postal_code | |
|---|---|---|---|---|---|---|---|
| 0 | 00000dbacae5abe5e23885899a1fa44253a17956c6d1c3... | 0.0 | 0.0 | ACTIVE | NONE | 49.0 | 52043ee2162cf5aa7ee79974281641c6f11a68d276429a... |
| 1 | 0000423b00ade91418cceaf3b26c6af3dd342b51fd051e... | 0.0 | 0.0 | ACTIVE | NONE | 25.0 | 2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93... |
| 2 | 000058a12d5b43e67d225668fa1f8d618c13dc232df0ca... | 0.0 | 0.0 | ACTIVE | NONE | 24.0 | 64f17e6a330a85798e4998f62d0930d14db8db1c054af6... |
| 3 | 00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2... | 0.0 | 0.0 | ACTIVE | NONE | 54.0 | 5d36574f52495e81f019b680c843c443bd343d5ca5b1c2... |
| 4 | 00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f... | 1.0 | 1.0 | ACTIVE | Regularly | 52.0 | 25fa5ddee9aac01b35208d01736e57942317d756b32ddd... |
| 5 | 000064249685c11552da43ef22a5030f35a147f723d5b0... | 0.0 | 0.0 | Not Active | Biannually | 34.0 | 2c29ae653a9282cce4151bd87643c907644e09541abc28... |
| 6 | 0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d... | 0.0 | 0.0 | ACTIVE | NONE | 20.0 | fe7b8e2b3fafb89ca90db17ffeeae0fd29b795d803f749... |
| 7 | 00007d2de826758b65a93dd24ce629ed66842531df6699... | 1.0 | 1.0 | ACTIVE | Regularly | 32.0 | 8d6f45050876d059c830a0fe63f1a4c022de279bb68ce3... |
| 8 | 00007e8d4e54114b5b2a9b51586325a8d0fa74ea23ef77... | 0.0 | 0.0 | ACTIVE | NONE | 20.0 | 2c29ae653a9282cce4151bd87643c907644e09541abc28... |
| 9 | 00008469a21b50b3d147c97135e25b4201a8c58997f787... | 0.0 | 0.0 | ACTIVE | NONE | 20.0 | 2c29ae653a9282cce4151bd87643c907644e09541abc28... |
#IMPORTING "TRANSACTION" DATA FOR H&M
# Load the transaction log from CSV and preview the first ten rows.
trans = pd.read_csv("transactions.csv")
trans.head(10)
| t_dat | customer_id | product_id | price | sales_channel_id | |
|---|---|---|---|---|---|
| 0 | 2018-09-20 | 000058a12d5b43e67d225668fa1f8d618c13dc232df0ca... | 663713001 | $101.66 | 2 |
| 1 | 2018-09-20 | 000058a12d5b43e67d225668fa1f8d618c13dc232df0ca... | 541518023 | $60.98 | 2 |
| 2 | 2018-09-20 | 00007d2de826758b65a93dd24ce629ed66842531df6699... | 505221004 | $30.47 | 2 |
| 3 | 2018-09-20 | 00007d2de826758b65a93dd24ce629ed66842531df6699... | 685687003 | $33.86 | 2 |
| 4 | 2018-09-20 | 00007d2de826758b65a93dd24ce629ed66842531df6699... | 685687004 | $33.86 | 2 |
| 5 | 2018-09-20 | 00007d2de826758b65a93dd24ce629ed66842531df6699... | 685687001 | $33.86 | 2 |
| 6 | 2018-09-20 | 00007d2de826758b65a93dd24ce629ed66842531df6699... | 505221001 | $40.64 | 2 |
| 7 | 2018-09-20 | 00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4... | 688873012 | $60.98 | 1 |
| 8 | 2018-09-20 | 00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4... | 501323011 | $106.75 | 1 |
| 9 | 2018-09-20 | 00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4... | 598859003 | $91.49 | 2 |
#DATA INFORMATION FOR "PRODUCT"
# Column dtypes, non-null counts, and memory footprint of the product table
prod.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 105540 entries, 0 to 105539 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 product_id 105540 non-null int64 1 product_code 105540 non-null int64 2 product_type_no 105540 non-null int64 3 product_type_name 105540 non-null object 4 product_group_name 105540 non-null object 5 department_no 105540 non-null int64 6 department_name 105540 non-null object 7 index_code 105540 non-null object 8 index_group_name 105540 non-null object 9 graphical_appearance_no 105540 non-null int64 10 graphical_appearance 105540 non-null object 11 colour_group_code 105540 non-null int64 12 colour_group_name 105540 non-null object 13 section_no 105540 non-null int64 14 clothing_group_no 105540 non-null int64 dtypes: int64(8), object(7) memory usage: 12.1+ MB
#CHECKING COLUMNS IN "PRODUCT" DATA
# List the column labels of the product table
prod.columns
Index(['product_id', 'product_code', 'product_type_no', 'product_type_name',
'product_group_name', 'department_no', 'department_name', 'index_code',
'index_group_name', 'graphical_appearance_no', 'graphical_appearance',
'colour_group_code', 'colour_group_name', 'section_no',
'clothing_group_no'],
dtype='object')
#CHECKING FOR MISSING VALUES IN "PRODUCT" DATA
# Per-column count of null entries
prod.isnull().sum()
product_id 0 product_code 0 product_type_no 0 product_type_name 0 product_group_name 0 department_no 0 department_name 0 index_code 0 index_group_name 0 graphical_appearance_no 0 graphical_appearance 0 colour_group_code 0 colour_group_name 0 section_no 0 clothing_group_no 0 dtype: int64
#SUMMARY STATS FOR "PRODUCT" DATA
# Descriptive statistics (count/mean/std/quartiles) for numeric columns
prod.describe()
| product_id | product_code | product_type_no | department_no | graphical_appearance_no | colour_group_code | section_no | clothing_group_no | |
|---|---|---|---|---|---|---|---|---|
| count | 1.055400e+05 | 105540.000000 | 105540.000000 | 105540.000000 | 1.055400e+05 | 1.055400e+05 | 105540.000000 | 105540.000000 |
| mean | 6.984208e+08 | 698420.812138 | 234.987360 | 4532.783836 | 1.010013e+06 | 3.001975e+02 | 42.664213 | 1010.438128 |
| std | 1.284606e+08 | 128460.651523 | 74.738532 | 2712.717364 | 6.694473e+00 | 1.644887e+04 | 23.260325 | 6.730984 |
| min | 1.087750e+08 | 108775.000000 | 49.000000 | 1201.000000 | 1.010001e+06 | 1.000000e+00 | 2.000000 | 1001.000000 |
| 25% | 6.169878e+08 | 616987.750000 | 252.000000 | 1676.000000 | 1.010008e+06 | 9.000000e+00 | 20.000000 | 1005.000000 |
| 50% | 7.022120e+08 | 702212.000000 | 259.000000 | 4222.000000 | 1.010016e+06 | 1.400000e+01 | 46.000000 | 1009.000000 |
| 75% | 7.967008e+08 | 796700.750000 | 272.000000 | 7389.000000 | 1.010016e+06 | 5.200000e+01 | 61.000000 | 1017.000000 |
| max | 9.594610e+08 | 959461.000000 | 762.000000 | 9989.000000 | 1.010030e+06 | 1.010030e+06 | 97.000000 | 1025.000000 |
#GENERATING PANDAS DATAFRAME SUMMARY FOR "PRODUCT" DATA
def sniff_modified(df):
    """Return a per-column summary of *df*.

    Columns: 'data type', 'percent missing', 'No. unique', 'unique values'.
    Rows are sorted by the string form of the dtype: comparing raw numpy
    dtype objects relies on casting-rule semantics, which is not a total
    order, so the string form gives a stable, portable ordering.
    """
    # NOTE: the original wrapped this in pd.option_context("display.max_colwidth", 20),
    # which had no effect — the frame is returned, not rendered, inside the context.
    info = pd.DataFrame()
    info['data type'] = df.dtypes
    info['percent missing'] = df.isnull().sum() * 100 / len(df)
    # nunique(dropna=False) equals len(x.unique()) but is vectorized
    info['No. unique'] = df.nunique(dropna=False)
    info['unique values'] = df.apply(lambda x: x.unique())
    return info.sort_values('data type', key=lambda s: s.astype(str))
sniff_modified(prod)
| data type | percent missing | No. unique | unique values | |
|---|---|---|---|---|
| product_id | int64 | 0.0 | 105540 | [108775015, 108775044, 108775051, 110065001, 1... |
| product_code | int64 | 0.0 | 47222 | [108775, 110065, 111565, 111586, 111593, 11160... |
| product_type_no | int64 | 0.0 | 131 | [253, 306, 304, 302, 273, 252, 254, 272, 72, 5... |
| department_no | int64 | 0.0 | 299 | [1676, 1339, 3608, 6515, 1334, 5883, 2032, 434... |
| graphical_appearance_no | int64 | 0.0 | 30 | [1010016, 1010017, 1010001, 1010010, 1010019, ... |
| colour_group_code | int64 | 0.0 | 50 | [9, 10, 11, 12, 13, 7, 71, 6, 73, 8, 52, 43, 1... |
| section_no | int64 | 0.0 | 57 | [16, 61, 62, 44, 26, 8, 66, 22, 46, 51, 79, 31... |
| clothing_group_no | int64 | 0.0 | 21 | [1002, 1017, 1021, 1005, 1019, 1016, 1007, 102... |
| product_type_name | object | 0.0 | 148 | [Vest top, Tank Tops, Leggings, Socks, Legging... |
| product_group_name | object | 0.0 | 21 | [Garment Upper body, Base Layers, Socks & Tigh... |
| department_name | object | 0.0 | 203 | [Jersey Basic, Lifestyle, Tights basic, Kids, ... |
| index_code | object | 0.0 | 10 | [A, B, G, F, C, S, H, D, I, J] |
| index_group_name | object | 0.0 | 6 | [Ladieswear, Baby/Children, Menswear, Sport, D... |
| graphical_appearance | object | 0.0 | 30 | [Solid, Stripe, All over pattern, Melange, Tra... |
| colour_group_name | object | 0.0 | 50 | [Black, White, Off White, Light Beige, Beige, ... |
#CHECKING "PRODUCT" DATAFRAME SHAPE
# (rows, columns) of the product table
prod.shape
(105540, 15)
#CHECKING DUPLICATES VALUES IN "PRODUCT" DATA
# True if any fully-duplicated row exists
prod.duplicated().any()
False
#DATA INFORMATION FOR "CUSTOMER"
# Column dtypes, non-null counts, and memory footprint of the customer table
cust.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1048575 entries, 0 to 1048574 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customer_id 1048575 non-null object 1 FN 1048575 non-null float64 2 Active 1048575 non-null float64 3 club_member_status 1048575 non-null object 4 fashion_news_frequency 1048575 non-null object 5 age 1048575 non-null float64 6 postal_code 1048575 non-null object dtypes: float64(3), object(4) memory usage: 56.0+ MB
#CHECKING COLUMNS IN "CUSTOMER" DATA
# List the column labels of the customer table
cust.columns
Index(['customer_id', 'FN', 'Active', 'club_member_status',
'fashion_news_frequency', 'age', 'postal_code'],
dtype='object')
#CHECKING FOR MISSING VALUES IN "CUSTOMER" DATA
# Per-column count of null entries
cust.isnull().sum()
customer_id 0 FN 0 Active 0 club_member_status 0 fashion_news_frequency 0 age 0 postal_code 0 dtype: int64
#SUMMARY STATS FOR "CUSTOMER" DATA
# Descriptive statistics for the numeric columns (FN, Active, age)
cust.describe()
| FN | Active | age | |
|---|---|---|---|
| count | 1.048575e+06 | 1.048575e+06 | 1.048575e+06 |
| mean | 3.479923e-01 | 3.387798e-01 | 3.637593e+01 |
| std | 4.763338e-01 | 4.732951e-01 | 1.424735e+01 |
| min | 0.000000e+00 | 0.000000e+00 | 1.600000e+01 |
| 25% | 0.000000e+00 | 0.000000e+00 | 2.400000e+01 |
| 50% | 0.000000e+00 | 0.000000e+00 | 3.200000e+01 |
| 75% | 1.000000e+00 | 1.000000e+00 | 4.900000e+01 |
| max | 1.000000e+00 | 1.000000e+00 | 9.900000e+01 |
#GENERATING PANDAS DATAFRAME SUMMARY FOR "CUSTOMER" DATA
def sniff_modified(df):
    """Return a per-column summary of *df*.

    Columns: 'data type', 'percent missing', 'No. unique', 'unique values'.
    Rows are sorted by the string form of the dtype: comparing raw numpy
    dtype objects relies on casting-rule semantics, which is not a total
    order, so the string form gives a stable, portable ordering.
    """
    # NOTE: the original wrapped this in pd.option_context("display.max_colwidth", 20),
    # which had no effect — the frame is returned, not rendered, inside the context.
    info = pd.DataFrame()
    info['data type'] = df.dtypes
    info['percent missing'] = df.isnull().sum() * 100 / len(df)
    # nunique(dropna=False) equals len(x.unique()) but is vectorized
    info['No. unique'] = df.nunique(dropna=False)
    info['unique values'] = df.apply(lambda x: x.unique())
    return info.sort_values('data type', key=lambda s: s.astype(str))
sniff_modified(cust)
| data type | percent missing | No. unique | unique values | |
|---|---|---|---|---|
| FN | float64 | 0.0 | 2 | [0.0, 1.0] |
| Active | float64 | 0.0 | 2 | [0.0, 1.0] |
| age | float64 | 0.0 | 84 | [49.0, 25.0, 24.0, 54.0, 52.0, 34.0, 20.0, 32.... |
| customer_id | object | 0.0 | 1048575 | [00000dbacae5abe5e23885899a1fa44253a17956c6d1c... |
| club_member_status | object | 0.0 | 4 | [ACTIVE, Not Active, PRE-CREATE, LEFT CLUB] |
| fashion_news_frequency | object | 0.0 | 5 | [NONE, Regularly, Biannually, Monthly, None] |
| postal_code | object | 0.0 | 326710 | [52043ee2162cf5aa7ee79974281641c6f11a68d276429... |
#CHECKING "CUSTOMER" DATAFRAME SHAPE
# (rows, columns) of the customer table
cust.shape
(1048575, 7)
#CHECKING DUPLICATES VALUES IN "CUSTOMER" DATA
# True if any fully-duplicated row exists
cust.duplicated().any()
False
#DATA INFORMATION FOR "TRANSACTION"
# Column dtypes, non-null counts, and memory footprint of the transaction table
trans.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1048575 entries, 0 to 1048574 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 t_dat 1048575 non-null object 1 customer_id 1048575 non-null object 2 product_id 1048575 non-null int64 3 price 1048575 non-null object 4 sales_channel_id 1048575 non-null int64 dtypes: int64(2), object(3) memory usage: 40.0+ MB
#CHECKING COLUMNS IN "TRANSACTION" DATA
# List the column labels — note the padded ' price ' label in the raw file
trans.columns
Index(['t_dat', 'customer_id', 'product_id', ' price ', 'sales_channel_id'], dtype='object')
#CHECKING FOR MISSING VALUES IN "TRANSACTION" DATA
# Per-column count of null entries
trans.isnull().sum()
t_dat 0 customer_id 0 product_id 0 price 0 sales_channel_id 0 dtype: int64
#SUMMARY STATS FOR "TRANSACTION" DATA
# Descriptive statistics for numeric columns only; price is excluded because
# it was loaded as "$xx.xx" strings (object dtype)
trans.describe()
| product_id | sales_channel_id | |
|---|---|---|
| count | 1.048575e+06 | 1.048575e+06 |
| mean | 5.861657e+08 | 1.663561e+00 |
| std | 1.070634e+08 | 4.724914e-01 |
| min | 1.087750e+08 | 1.000000e+00 |
| 25% | 5.544500e+08 | 1.000000e+00 |
| 50% | 6.196550e+08 | 2.000000e+00 |
| 75% | 6.605900e+08 | 2.000000e+00 |
| max | 7.402370e+08 | 2.000000e+00 |
#GENERATING PANDAS DATAFRAME SUMMARY FOR "TRANSACTION" DATA
def sniff_modified(df):
    """Return a per-column summary of *df*.

    Columns: 'data type', 'percent missing', 'No. unique', 'unique values'.
    Rows are sorted by the string form of the dtype: comparing raw numpy
    dtype objects relies on casting-rule semantics, which is not a total
    order, so the string form gives a stable, portable ordering.
    """
    # NOTE: the original wrapped this in pd.option_context("display.max_colwidth", 20),
    # which had no effect — the frame is returned, not rendered, inside the context.
    info = pd.DataFrame()
    info['data type'] = df.dtypes
    info['percent missing'] = df.isnull().sum() * 100 / len(df)
    # nunique(dropna=False) equals len(x.unique()) but is vectorized
    info['No. unique'] = df.nunique(dropna=False)
    info['unique values'] = df.apply(lambda x: x.unique())
    return info.sort_values('data type', key=lambda s: s.astype(str))
sniff_modified(trans)
| data type | percent missing | No. unique | unique values | |
|---|---|---|---|---|
| product_id | int64 | 0.0 | 29574 | [663713001, 541518023, 505221004, 685687003, 6... |
| sales_channel_id | int64 | 0.0 | 2 | [2, 1] |
| t_dat | object | 0.0 | 23 | [2018-09-20, 2018-09-21, 2018-09-22, 2018-09-2... |
| customer_id | object | 0.0 | 222025 | [000058a12d5b43e67d225668fa1f8d618c13dc232df0c... |
| price | object | 0.0 | 2938 | [ $101.66 , $60.98 , $30.47 , $33.86 , $40... |
#CHECKING "TRANSACTION" DATAFRAME SHAPE
# (rows, columns) of the transaction table
trans.shape
(1048575, 5)
#CHECKING DUPLICATES VALUES IN "TRANSACTION" DATA
# True here: repeat purchases of the same item on the same day look identical,
# so duplicated rows are expected in a transaction log
trans.duplicated().any()
True
#
# Horizontal bar chart of per-column cardinality in the product table,
# restricted to columns with fewer than 500 distinct values.
cardinalities = prod.nunique().sort_values(ascending=True)
low_card = cardinalities[cardinalities < 500]
# Creating a figure and axes
fig, axes = plt.subplots(figsize=(12, 9))
# Using seaborn's pastel palette, one colour per bar
palette = sns.color_palette('pastel', len(low_card))
# Creating the horizontal bar plot on the axes
axes.barh(low_card.index, low_card.values, color=palette)
# Displaying the count number on each bar
for pos, count in enumerate(low_card.values):
    axes.text(count, pos, str(count), ha='right', va='center', fontsize=12)
axes.set_title('Frequency Distribution of Product Categories')
axes.set_xlabel('Count')
axes.set_ylabel('Product Categories')
plt.show()
#
# Vertical bar chart of the ten most common product types, with count labels.
top_types = prod['product_type_name'].value_counts().head(10)
plt.figure(figsize=(10, 6))
# darkest blue for the largest bar, fading toward the smallest
shades = cm.Blues(np.linspace(0.3, 1, 10))[::-1]
top_types.plot(kind='bar', color=shades)
# annotate each bar slightly above its top
for pos, cnt in enumerate(top_types):
    plt.text(pos, cnt + 0.2, str(cnt), ha='center')
plt.title('Distribution of Products by Product Type')
plt.xlabel('Product Types')
plt.ylabel('Number of Products')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# TARGET
# Pie chart of product share per index group; the largest slice is exploded.
product_counts = prod['index_group_name'].value_counts()
colors = sns.color_palette("Set2")
# hoist the loop-invariant maximum out of the comprehension
largest = max(product_counts)
explode = [0.1 if cnt == largest else 0 for cnt in product_counts]
plt.figure(figsize=(10, 10))
plt.pie(product_counts, labels=product_counts.index, autopct='%1.1f%%', colors=colors, explode=explode)
plt.title('Product distribution for Targeted Audience', fontsize=12)
plt.axis('equal')
plt.legend(product_counts.index, loc="upper right", fontsize=8)
plt.show()
# POSITIONING
# Scatter of product codes against graphical-appearance categories.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(prod['product_code'], prod['graphical_appearance'], alpha=0.5, color='teal')
# Title and labels
ax.set_title('Product Positioning Analysis')
ax.set_xlabel('Product Codes')
ax.set_ylabel('Graphical Appearances')
ax.grid(True)
fig.tight_layout()
# Show plot
plt.show()
#TOP Color Preference
# BUG FIX: the original plotted hard-coded placeholder counts
# ({'Red': 25, 'Blue': 18, ...}) yet divided them by len(prod) for the
# percentage labels, which made both the bars and the percentages
# meaningless. Derive the top-5 colours from the actual product data so the
# counts and the share-of-catalogue labels are consistent.
top_colors = prod['colour_group_name'].value_counts().head(5)
num_colors = len(top_colors)
colors = cm.Reds(np.linspace(0.3, 1, num_colors))
plt.figure(figsize=(10, 8))
top_colors.sort_values().plot(kind='barh', color=colors)
# label each bar with its count and its share of the whole catalogue
for index, value in enumerate(top_colors.sort_values()):
    plt.text(value, index, f'{value} ({value*100/len(prod):.2f}%)', ha='left', va='center')
plt.title('Top Colour Preferences and Distribution')
plt.xlabel('Count')
plt.ylabel('Colour Groups')
plt.tight_layout()
plt.show()
#
# Pairwise correlations of the numeric product columns as a gradient table.
# FIX: numeric_only=True is required — prod mixes int64 and object columns,
# and DataFrame.corr() on mixed dtypes raises in pandas >= 2.0 (older
# versions silently dropped the non-numeric columns).
corr = prod.corr(numeric_only=True)
corr.style.background_gradient(cmap="Oranges")
| product_id | product_code | product_type_no | department_no | graphical_appearance_no | colour_group_code | section_no | clothing_group_no | |
|---|---|---|---|---|---|---|---|---|
| product_id | 1.000000 | 1.000000 | -0.028665 | -0.077245 | 0.000509 | 0.000657 | -0.044523 | -0.013446 |
| product_code | 1.000000 | 1.000000 | -0.028665 | -0.077245 | 0.000509 | 0.000657 | -0.044523 | -0.013446 |
| product_type_no | -0.028665 | -0.028665 | 1.000000 | -0.002364 | -0.028015 | 0.007111 | -0.118779 | -0.462755 |
| department_no | -0.077245 | -0.077245 | -0.002364 | 1.000000 | -0.094780 | 0.003589 | 0.349181 | -0.053455 |
| graphical_appearance_no | 0.000509 | 0.000509 | -0.028015 | -0.094780 | 1.000000 | 0.042113 | -0.030727 | 0.058265 |
| colour_group_code | 0.000657 | 0.000657 | 0.007111 | 0.003589 | 0.042113 | 1.000000 | -0.015142 | -0.018781 |
| section_no | -0.044523 | -0.044523 | -0.118779 | 0.349181 | -0.030727 | -0.015142 | 1.000000 | 0.217791 |
| clothing_group_no | -0.013446 | -0.013446 | -0.462755 | -0.053455 | 0.058265 | -0.018781 | 0.217791 | 1.000000 |
# Correlation
# Heatmap of the numeric-column correlation matrix.
# FIX: numeric_only=True — required for pandas >= 2.0 on mixed-dtype frames
# (matches the behaviour older pandas applied implicitly).
corrmat = prod.corr(numeric_only=True)
f, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corrmat, square=True, annot=True, annot_kws={'size': 10}, cmap="coolwarm")
plt.title('Correlation Analysis of Products for Marketing Insights')
plt.show()
#Purchase Behaviors according to Age
# Age histogram annotated with mean / median / quartile reference lines.
plt.figure(figsize=(10, 6))
plt.hist(cust["age"], bins=70, edgecolor="black", color="#3498db", alpha=0.7)
# Calculating descriptive statistics
ages = cust["age"]
mean_age = np.mean(ages)
median_age = np.median(ages)
q1, q3 = np.percentile(ages, [25, 75])
# one dashed vertical line per statistic
for stat, colour, label in (
        (mean_age, 'red', f'Mean Age: {mean_age:.2f}'),
        (median_age, 'orange', f'Median Age: {median_age:.2f}'),
        (q1, 'green', f'Q1: {q1:.2f}'),
        (q3, 'blue', f'Q3: {q3:.2f}')):
    plt.axvline(stat, color=colour, linestyle='dashed', linewidth=1, label=label)
plt.title("Customers' Age Distribution with Descriptive Statistics", fontweight="bold", size=12)
plt.xlabel("Age", fontweight="bold", size=12)
plt.ylabel("Count", fontweight="bold", size=12)
plt.legend()
plt.show()
#Age group analysis : Which age group purchase more Products?
bins = [0, 18, 30, 40, 50, 60, 100]
labels = ['0-18', '19-30', '31-40', '41-50', '51-60', '60+']
cust['age_groups'] = pd.cut(cust['age'], bins=bins, labels=labels, right=False)
# BUG FIX: the original assigned prod['product_id'] (product ID numbers,
# aligned only by row position) as each customer's "purchase quantity", so
# the chart summed meaningless ID values. Count each customer's rows in the
# transaction log instead; customers with no transactions get 0.
cust['purchase_quantity'] = (
    cust['customer_id'].map(trans.groupby('customer_id').size()).fillna(0)
)
age_group_purchase = cust.groupby('age_groups')['purchase_quantity'].sum().reset_index()
total_purchased = age_group_purchase['purchase_quantity'].sum()
plt.figure(figsize=(8, 6))
plt.title("Identifying the Highest Purchasing Segment", size=15)
sns.barplot(x="age_groups", y="purchase_quantity", data=age_group_purchase, palette="rocket")
plt.xlabel("Age Group", size=12)
plt.ylabel("Total Purchased Quantity", size=12)
plt.xticks(rotation=45)
# annotate each bar with its share of all purchases
for index, row in age_group_purchase.iterrows():
    plt.text(index, row['purchase_quantity'], f"{(row['purchase_quantity'] / total_purchased) * 100:.2f}%",
             color='black', ha="center", fontsize=12)
plt.tight_layout()
plt.show()
#Is there a higher purchasing tendency among active fashion news subscribers?
# Share of total purchased quantity per fashion-news-frequency level.
per_freq = cust.groupby("fashion_news_frequency")["purchase_quantity"].sum()
per_freq = (per_freq / per_freq.sum() * 100).rename('Purchased Quantity(%)').reset_index()
plt.figure(figsize=(10, 8))
plt.title("Purchased Quantity by Fashion News Frequency", size=14)
g = sns.barplot(x="fashion_news_frequency", y="Purchased Quantity(%)",
                data=per_freq, palette="viridis", edgecolor="black")
plt.xlabel("Fashion News Frequency", size=12)
plt.ylabel("Purchased Quantity (%)", size=12)
# print the percentage on top of every bar
for container in g.containers:
    g.bar_label(container, padding=5, fmt='%.2f', fontsize=12, color="black")
plt.show()
#Subscribers to fashion news account for 34.65% of total purchases,
#whereas non-subscribers account for 64.04%.
# analyzing fashion news frequency by age group
# Within each age group, the percentage split of fashion-news-frequency
# values; only the "Regularly" and "NONE" levels are plotted.
data_age_news = (
    cust.groupby('age_groups')['fashion_news_frequency']
        .value_counts(normalize=True)
        .mul(100)
        .rename('percent(%)')
        .reset_index()
)
data_age_news = data_age_news[data_age_news["fashion_news_frequency"].isin(["Regularly", "NONE"])]
palette1 = {"Regularly": '#FFA500', "NONE": '#0000FF'}
plt.figure(figsize=(13, 9))
plt.title("Fashion News Frequency by age group", size=24)
g = sns.barplot(x="age_groups", y="percent(%)", data=data_age_news, hue="fashion_news_frequency", palette=palette1)
plt.xlabel("Age group", size=18)
plt.ylabel("Percentage of new frequency", size=18)
# print the percentage on top of every bar
for container in g.containers:
    g.bar_label(container, padding=5, fmt='%.1f', fontsize=16, color="black")
plt.legend(title='News\nFrequency', bbox_to_anchor=(1.0, 1.0), ncol=1, fancybox=True, shadow=True, fontsize=14, title_fontsize=22)
plt.show()
#
# Filled line chart of club-member-status shares (percent of customers).
shares = (cust["club_member_status"].value_counts(normalize=True) * 100).round(2)
plt.figure(figsize=(8, 6))
plt.fill_between(shares.index, shares.values, color='teal', alpha=0.4)
plt.plot(shares.index, shares.values, color='orange', alpha=0.6, linewidth=2)
plt.title('Club Member Status', fontsize=16)
plt.xlabel('Club Member Status')
plt.ylabel('Percentage')
plt.xticks(rotation=50)
# print the exact percentage above each point
for pos, pct in enumerate(shares.values):
    plt.text(pos, pct, f'{pct}%', ha='center', va='bottom', fontsize=10, color='black')
plt.tight_layout()
plt.show()
#creating age bin
# Decade bins [10, 20), [20, 30), ... [90, 100), labelled "10's" ... "90's".
decade_edges = list(range(10, 101, 10))
decade_labels = [f"{edge}'s" for edge in decade_edges[:-1]]
binned = pd.cut(cust["age"], decade_edges, include_lowest=True, right=False, labels=decade_labels)
cust["age_binned"] = pd.Categorical(binned, categories=decade_labels, ordered=True)
# NOTE(review): plain assignment — customer_binned aliases cust, it is not a copy
customer_binned = cust
cust["age_binned"].value_counts(sort=False)
10's 54589 20's 406615 30's 184870 40's 159307 50's 172985 60's 55597 70's 13482 80's 1056 90's 74 Name: age_binned, dtype: int64
cust.groupby(["age_binned","club_member_status"]).size()
age_binned club_member_status
10's ACTIVE 53589
LEFT CLUB 33
Not Active 89
PRE-CREATE 878
20's ACTIVE 386729
LEFT CLUB 148
Not Active 1332
PRE-CREATE 18406
30's ACTIVE 167011
LEFT CLUB 64
Not Active 1383
PRE-CREATE 16412
40's ACTIVE 143292
LEFT CLUB 34
Not Active 1104
PRE-CREATE 14877
50's ACTIVE 157636
LEFT CLUB 51
Not Active 500
PRE-CREATE 14798
60's ACTIVE 50954
LEFT CLUB 11
Not Active 162
PRE-CREATE 4470
70's ACTIVE 12371
LEFT CLUB 7
Not Active 30
PRE-CREATE 1074
80's ACTIVE 957
LEFT CLUB 0
Not Active 3
PRE-CREATE 96
90's ACTIVE 60
LEFT CLUB 0
Not Active 1
PRE-CREATE 13
dtype: int64
#updating plotly version
!pip install plotly --upgrade
Requirement already satisfied: plotly in c:\users\kpv_2\anaconda3\envs\dab200\lib\site-packages (5.22.0) Requirement already satisfied: tenacity>=6.2.0 in c:\users\kpv_2\anaconda3\envs\dab200\lib\site-packages (from plotly) (8.2.3) Requirement already satisfied: packaging in c:\users\kpv_2\anaconda3\envs\dab200\lib\site-packages (from plotly) (22.0)
WARNING: Ignoring invalid distribution -lotly (c:\users\kpv_2\anaconda3\envs\dab200\lib\site-packages) WARNING: Ignoring invalid distribution -lotly (c:\users\kpv_2\anaconda3\envs\dab200\lib\site-packages) WARNING: Ignoring invalid distribution -lotly (c:\users\kpv_2\anaconda3\envs\dab200\lib\site-packages) WARNING: Ignoring invalid distribution -lotly (c:\users\kpv_2\anaconda3\envs\dab200\lib\site-packages) WARNING: Ignoring invalid distribution -lotly (c:\users\kpv_2\anaconda3\envs\dab200\lib\site-packages) WARNING: Ignoring invalid distribution -lotly (c:\users\kpv_2\anaconda3\envs\dab200\lib\site-packages)
# How to Target Different Segments of Customers Based on Their Age and Club Membership?
# Treemap: age decade -> club member status, sized by customer count.
# FIX: build the counts with groupby(...).size() instead of aggregating the
# "club_member_status" grouping key with .agg — aggregating a groupby key is
# fragile/disallowed in newer pandas, and size() also removes the need for
# the MultiIndex column-flattening step.
details = (cust.groupby(["age_binned", "club_member_status"])
               .size()
               .reset_index(name="club_member_status_count"))
custom_palette = px.colors.qualitative.Set3
fig = px.treemap(details, path=['age_binned', 'club_member_status'],
                 values='club_member_status_count',
                 color='club_member_status',
                 color_discrete_sequence=custom_palette)
fig.update_layout(height=600, width=800, title={
    'text': 'Club Member Status Distribution across Age Groups',
    'x': 0.5,
    'y': 0.95
})
fig.show()
trans.head(5)
| t_dat | customer_id | product_id | price | sales_channel_id | |
|---|---|---|---|---|---|
| 0 | 2018-09-20 | 000058a12d5b43e67d225668fa1f8d618c13dc232df0ca... | 663713001 | $101.66 | 2 |
| 1 | 2018-09-20 | 000058a12d5b43e67d225668fa1f8d618c13dc232df0ca... | 541518023 | $60.98 | 2 |
| 2 | 2018-09-20 | 00007d2de826758b65a93dd24ce629ed66842531df6699... | 505221004 | $30.47 | 2 |
| 3 | 2018-09-20 | 00007d2de826758b65a93dd24ce629ed66842531df6699... | 685687003 | $33.86 | 2 |
| 4 | 2018-09-20 | 00007d2de826758b65a93dd24ce629ed66842531df6699... | 685687004 | $33.86 | 2 |
trans.columns
Index(['t_dat', 'customer_id', 'product_id', ' price ', 'sales_channel_id'], dtype='object')
#renaming ' price ' to 'price'
trans.rename(columns={' price ': 'price'}, inplace=True)
# FIX: the column name was cleaned but the values were still padded strings
# like " $101.66 " (object dtype), unusable for any numeric work. Strip the
# currency symbol / whitespace / thousands separators and cast to float.
trans['price'] = (
    trans['price'].astype(str)
                  .str.replace(r'[\$,\s]', '', regex=True)
                  .astype(float)
)
trans.columns
Index(['t_dat', 'customer_id', 'product_id', 'price', 'sales_channel_id'], dtype='object')
trans.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1048575 entries, 0 to 1048574 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 t_dat 1048575 non-null object 1 customer_id 1048575 non-null object 2 product_id 1048575 non-null int64 3 price 1048575 non-null object 4 sales_channel_id 1048575 non-null int64 dtypes: int64(2), object(3) memory usage: 40.0+ MB
#
# Print the shape of each dataset in a distinct colour.
# NOTE(review): termcolor is a third-party dependency not imported at the top
# of the file — confirm it is installed in the target environment.
from termcolor import colored
print(f"Observations in Products dataset: {colored(prod.shape, 'red')}")
print(f"Observations in Customers dataset: {colored(cust.shape, 'green')}")
print(f"Observations in Transactions dataset: {colored(trans.shape, 'blue')}")
Observations in Products dataset: (105540, 15) Observations in Customers dataset: (1048575, 10) Observations in Transactions dataset: (1048575, 5)
# Print the column index of each dataset in a distinct colour
print(colored(prod.columns, 'yellow'))
print(colored(cust.columns, 'green'))
print(colored(trans.columns, 'blue'))
Index(['product_id', 'product_code', 'product_type_no', 'product_type_name', 'product_group_name', 'department_no', 'department_name', 'index_code', 'index_group_name', 'graphical_appearance_no', 'graphical_appearance', 'colour_group_code', 'colour_group_name', 'section_no', 'clothing_group_no'], dtype='object') Index(['customer_id', 'FN', 'Active', 'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'age_groups', 'purchase_quantity', 'age_binned'], dtype='object') Index(['t_dat', 'customer_id', 'product_id', 'price', 'sales_channel_id'], dtype='object')
# Interactive histogram of product-type frequencies
fig = px.histogram(prod, x = 'product_type_name', width = 800,height = 500,title = 'Product Type Distribution')
fig.show()
# Interactive histogram of customer ages
fig = px.histogram(cust, x = 'age', width = 800,height = 500,title = 'Customer Age Type Distribution')
fig.show()
import plotly.express as px
# Horizontal bar chart of product counts per product type, largest first.
type_sizes = prod.groupby('product_type_name').size()
grouped_data = (type_sizes.reset_index(name='prod_count')
                          .sort_values('prod_count', ascending=False))
fig = px.bar(grouped_data, x='prod_count', y='product_type_name',
             color='prod_count',
             width=800, height=2000,
             title='Product Type Distribution by Count',
             labels={'product_type_name': 'Product Type', 'prod_count': 'Count'})
fig.show()
#
fig = px.bar(cust.groupby(['age']).count().sort_values('club_member_status',ascending=False),x='club_member_status',color ='club_member_status',
width = 800,height = 1000,title = 'Customer Age wise Distribution')
fig.show()
#top 10 customer's by number of Transactions
# Count rows per customer, then take the ten customers with the most rows.
transactions_byid = trans.groupby('customer_id').count()
ranked = transactions_byid.sort_values(by='price', ascending=False)
ranked['price'].head(10)
customer_id 75c54a755b8a467e53e0a4e01833deb029734feb22ad25438137925123a38f8b 170 560a3cc8223aed0616efdcb227aea89630480a5dd73d4567045fd522a5ce89f6 128 5b8aa448f9eb6235d403b43bf1279813a33527fcdee38de4cf1f674a5badf65b 123 78e25599369c983b39dce7214bf8fc54219646d69e4262046b813da423089cf3 112 8ecbac3466886ba06b611fc52dd86762a661f57a7ccd2a2138ae2439e36d7b37 111 e775206b7df6ce66793edc9ae5d1a3785e65a7c2cd35fedef50ba3cec26db582 110 2039c3a9cdd14183aae75ea2b5e52956e1b5d22b5ff4aa27da504c4f03ac8cd2 108 0152d53f51444891ea07013fd1fb8325415bb09bb6798a59359b21a8326d801b 108 2fdf822dbaad2b983b37e651a982bba24352a92c8a5c4c75be25c771f2af6d13 103 313eec305f72a967c9274266da85c0762fb4774c6da6a9b96f704f49b7b9587d 100 Name: price, dtype: int64
#CUSTOMER SEGMENTATION
# Per-customer transaction count, broadcast back onto every transaction row
trans['transaction_count'] = trans.groupby('customer_id')['price'].transform('count')
# BUG FIX: merging the raw one-row-per-transaction frame duplicated every
# customer once per transaction (row explosion on the many-to-one join).
# Deduplicate to a single (customer_id, transaction_count) row first.
cust = cust.merge(trans[['customer_id', 'transaction_count']].drop_duplicates('customer_id'),
                  on='customer_id', how='left')
# Feature Selection for Customer Segmentation
customer_features = cust[['age', 'club_member_status', 'transaction_count']]
# Handling categorical data for 'club_member_status' (one-hot, drop first level)
customer_features = pd.get_dummies(customer_features, columns=['club_member_status'], drop_first=True)
# Imputation of Missing Values — customers absent from the transaction log
# have NaN transaction_count after the left join
imputer = SimpleImputer(strategy='mean')
customer_features_imputed = imputer.fit_transform(customer_features)
# Data Preprocessing: zero-mean / unit-variance scaling for K-means
scaler = StandardScaler()
customer_features_scaled = scaler.fit_transform(customer_features_imputed)
# Applying K-means Clustering
# n_init=10 pins the pre-sklearn-1.4 default so results stay reproducible
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)  # Example with 5 clusters
clusters = kmeans.fit_predict(customer_features_scaled)
cust['cluster'] = clusters
# Evaluating Clusters
score = silhouette_score(customer_features_scaled, clusters)
print(f'Silhouette Score: {score}')
# Visualizing Clusters — columns 0 and 1 of the scaled matrix are age and
# transaction_count (the first two columns of customer_features)
plt.scatter(customer_features_scaled[:, 0], customer_features_scaled[:, 1], c=clusters, cmap='viridis')
plt.xlabel('Age')
plt.ylabel('Transaction Frequency')
plt.title('Customer Segments')
plt.show()